*----------------------------------------------------------------------------------------------------------	* 											   									
* PROGRAMMED BY:	Linea Hasager																			
* DESCRIPTION:		Clean first language spoken (mother tongue) in 1999-2003 DKDB data
* CREATED:			Nov. 20, 2019																		   	
* LAST MODIFIED:	May 31, 2022														       				
*----------------------------------------------------------------------------------------------------------	*

* Session setup: switch off output paging and start with no data in memory
set more off
clear

// Project folders referenced through the input and output global macros
global input 		"L:\Workdata\707455\Papers\ABFHP_1999reform\Code\ReStat\Input"
global output		"L:\Workdata\707455\Papers\ABFHP_1999reform\Code\ReStat\Output"


*--------------------------------------------------------------------------------------------------------	



*--------------------------------------------------------------------------------------------------------	
*		PART 1: CLEANING PRIMARY LANGUAGE SPOKEN PRIOR TO IMMIGRATION
*--------------------------------------------------------------------------------------------------------	

/* LOAD DATA */
* The documented option of -use- for discarding the data in memory is clear
use "${input}\_DKDB1999_2003", clear

* Build a one-row-per-person file with the person's most frequently observed
* language and the nationality recorded with it. The full data set is brought
* back by restore once the file has been saved.
preserve

*DROP MEANINGLESS LANGUAGE CODES (PLACEHOLDERS AND CODES STARTING WITH A DIGIT)
*The digit check is split over two calls because inlist() accepts at most 10 string arguments
drop if inlist(sprogkode,"(t)","(u)","---", "X","?", " ")
drop if inlist(substr(sprogkode,1,1),"0","1","2","3","4","5","6","7","8")
drop if inlist(substr(sprogkode,1,1),"9")

*COUNT NUMBER OF TIMES A LANGUAGE IS OBSERVED PER INDIVIDUAL
bysort pnr sprogkode: egen n=count(sprogkode)

*KEEP ONE LANGUAGE PER PERSON, MOST FREQUENT IF MULTIPLE
*The (negated) frequency has to be the first sort key after pnr, otherwise the
*alphabetically first nation and language would be kept instead of the most
*frequent language. Ties are broken by nation and then sprogkode so that the
*selection is reproducible.
tempvar neg_n
gen long `neg_n' = -n
bysort pnr (`neg_n' nation sprogkode): keep if _n==1
drop n `neg_n'

tempfile language
save `language'
restore

*SWAP THE RAW NATIONALITY AND LANGUAGE FOR THE CLEANED PERSON-LEVEL VALUES
*Persons that are not in the person-level language file end up with missing nation and sprogkode
drop sprogkode nation
merge m:1 pnr using `language', nogenerate

order pnr year nation sprogkode


*INDICATOR VARIABLES (0/1) FOR THE MOST COMMON LANGUAGES
*Each variable is 1 when sprogkode equals one of the listed codes and 0 otherwise.
*The creation order matters: later code addresses these variables as a varlist range.
*NOTE THAT DARI AND FARSI ARE BOTH PERSIAN LANGUAGES
generate afghan      = (sprogkode == "AFG")
generate albanian    = (sprogkode == "ALB")
generate amharic     = (sprogkode == "AMH")
generate arabic      = (sprogkode == "ARA")
generate armenian    = inlist(sprogkode, "ARE", "ARM")
generate assyrian    = (sprogkode == "ASY")
generate azeri       = (sprogkode == "AZE")
generate berber      = (sprogkode == "BER")
generate bosnian     = inlist(sprogkode, "BOS", "BO", "BSN")
generate bulgarian   = (sprogkode == "BUL")
generate dari        = (sprogkode == "DAR")
generate english     = (sprogkode == "ENG")
generate estonian    = (sprogkode == "EST")
generate farsi       = (sprogkode == "FAR")
generate filipino    = (sprogkode == "FIL")
generate finnish     = (sprogkode == "FIN")
generate french      = inlist(sprogkode, "FRA", "FR")
generate greek       = (sprogkode == "GRA")
generate greenlandic = (sprogkode == "GRO")
generate hebraic     = (sprogkode == "HEB")
generate hindu       = (sprogkode == "HIN")
generate dutch       = (sprogkode == "HOL")
generate indonesian  = (sprogkode == "IND")
generate iraqi       = inlist(sprogkode, "IRA", "IRK", "IRQ")
*ICELANDIC HAS TWO CODES, "ISL" AND "IS"; EACH CODE MUST BE ITS OWN inlist() ARGUMENT
gen icelandic=0
replace icelandic=1		if inlist(sprogkode,"ISL","IS")
*LANGUAGE INDICATORS (0/1), ITALIAN TO CZECH
*Each variable is 1 when sprogkode equals one of the listed codes and 0 otherwise;
*where one code covers several languages the nationality (nation) decides.
generate italian     = (sprogkode == "ITA")
generate japanese    = (sprogkode == "JAP")
generate chinese     = (sprogkode == "KIN")
generate kirundi     = (sprogkode == "KIR")
generate korean      = (sprogkode == "KOR")
generate croatian    = (sprogkode == "KRO")
generate kurdish     = (sprogkode == "KUR")
generate latvian     = (sprogkode == "LET")
generate lithuanian  = (sprogkode == "LIT")
generate luganda     = (sprogkode == "LUG")
generate macedonian  = (sprogkode == "MAK")

*CODE "MAN" IS SHARED BY MANDARIN AND MANDINGO AND IS SPLIT BY NATIONALITY
*Mandarin: China, Cambodia, Malaysia, Taiwan
generate mandarin    = (sprogkode == "MAN") & inlist(nation, "CHN", "KHM", "MYS", "TWN")
*Mandingo: Gambia, Guinea-Bissau, Liberia, Mozambique, Senegal, Sierra Leone
*NOTE(review): the list contains GIN although the country named is Guinea-Bissau - confirm the intended nation code
generate mandingo    = (sprogkode == "MAN") & inlist(nation, "GMB", "GIN", "LBR", "MOZ", "SEN", "SLE")

generate pakistani   = (sprogkode == "PAK")
generate pashto      = (sprogkode == "PAS")
generate persian     = (sprogkode == "PER")
generate polish      = (sprogkode == "POL")
generate portugese   = (sprogkode == "POR")
generate punjabi     = inlist(sprogkode, "PUN", "Pun")
*Code "ROM" is the language of the Roma, code "RUM" the language of Romanians
generate romani      = (sprogkode == "ROM")
generate romanian    = (sprogkode == "RUM")
generate russian     = (sprogkode == "RUS")
generate serbian     = (sprogkode == "SER")
generate sinhala     = (sprogkode == "SIN") & inlist(nation, "LKA", "SGP")
*Codes "SLO" and "SLV" are assigned to Slovenian or Slovakian depending on nationality
generate slovenian   = inlist(sprogkode, "SLO", "SLV") & (nation == "SVN")
generate slovakian   = inlist(sprogkode, "SLO", "SLV") & (nation == "SVK")
generate somali      = (sprogkode == "SOM")
generate spanish     = (sprogkode == "SPA")
generate swedish     = inlist(sprogkode, "SVE", "SWE")
generate tagalog     = (sprogkode == "TAG")
generate tamil       = (sprogkode == "TAM")
generate thai        = inlist(sprogkode, "THA", "TH", "Tha")
generate tigrinya    = (sprogkode == "TIN")
generate czech       = (sprogkode == "TJE")
*CODE "TYR" IS ALWAYS TURKISH. CODE "TUR" IS SPLIT BY NATIONALITY:
*nationals of IRN or TUR count as turkmen, every other nationality as turkish
*(Iranians speak turkmen, while Iraqis speak turkish)
gen turkish=0
replace turkish=1		if inlist(sprogkode,"TYR") | (inlist(sprogkode,"TUR") & !inlist(nation,"IRN","TUR"))
*The turkmen nationality list mirrors the list excluded from turkish above;
*without "IRN" here, Iranians coded "TUR" would be assigned to neither language
gen turkmen=0
replace turkmen=1		if inlist(sprogkode,"TUR") & inlist(nation,"IRN","TUR")
*LANGUAGE INDICATORS (0/1), TWI TO WOLOF
generate twi         = inlist(sprogkode, "TWI", "TW")
generate german      = (sprogkode == "TYS")
generate ukrainian   = (sprogkode == "UKR")
generate hungarian   = (sprogkode == "UNG")
generate urdu        = (sprogkode == "URD")
generate vietnamese  = inlist(sprogkode, "VIE", "VNM")
generate wolof       = (sprogkode == "WOL")

*VARIABLE LABELS FOR EVERY LANGUAGE INDICATOR (ALL VARIABLES FROM afghan THROUGH wolof)
foreach lang of varlist afghan-wolof {
	label variable `lang' "Speaks `lang'"
}



*--------------------------------------------------------------------------------------------------------	
*		PART 2: SHARE OF REFUGEES IN THE INTEGRATION PROGRAM WITHIN EACH CLASS (HOLDID)
*--------------------------------------------------------------------------------------------------------	

*FLAG REFUGEES: PERSONS FOUND IN THE REFUGEE FILE. ROWS THAT EXIST ONLY IN THAT FILE ARE NOT KEPT
merge m:1 pnr using "$input\Refugees1997_", keepusing(pnr tilladelsesdato) keep(master match)
generate refugee = (_merge == 3)
label variable refugee "Refugee"
drop _merge

*INTEGRATION PROGRAM STATUS, DEFINED ONLY FOR REFUGEES WITH A KNOWN ADMISSION DATE:
*1 = ADMISSION YEAR AFTER 1998, 0 = ADMISSION YEAR 1998 OR EARLIER, MISSING FOR EVERYONE ELSE
generate integration_program = (year(tilladelsesdato) > 1998) if refugee == 1 & !missing(tilladelsesdato)

*THE CLASS MEAN OF THE STATUS IS THE SHARE OF PROGRAM REFUGEES AMONG REFUGEES WITH A KNOWN ADMISSION DATE
*(a share of treated refugees, not a treated-to-control ratio, despite the variable name)
bysort holdid: egen treated_to_control = mean(integration_program)
label variable treated_to_control "Share of refugees in the class under the integration program (of all refugees with known admission dates)"

*THE PERSON-LEVEL STATUS IS NO LONGER NEEDED
drop integration_program 

*HISTOGRAM OF THE CLASS-LEVEL SHARE OF REFUGEES IN THE INTEGRATION PROGRAM
*The data are reduced to one observation per class for the plot only and restored afterwards
local sharevar  treated_to_control
local axistitle Share of refugees in the class under the integration program
preserve
duplicates drop holdid, force
histogram `sharevar', xtitle(`axistitle', size(small)) ///
	graphregion(fcolor(white) margin(medlarge)) mcolor(gs10) ///
	ylab(, format(%10.2fc)) xlab(, format(%10.2fc))
restore


*KEEP RELEVANT VARIABLES: IDENTIFIER, YEAR, ALL LANGUAGE DUMMIES (afghan THROUGH wolof), t_timer_kuho AND start_dato
*NOTE(review): this also drops refugee and treated_to_control, which are created earlier in this file - confirm they are not needed in the exported data
keep pnr year afghan-wolof t_timer_kuho start_dato

*----------------------------------
*			3) EXPORT DATA
*----------------------------------

*SAVE IN OLD STATA-FORMAT TO BE ABLE TO IMPORT TO SAS
*The file is written to the input folder and overwrites an existing file of the same name
compress
saveold "$input\dkdb1999_2003.dta", version(13) replace


*DELETE TEMPORARY DATA
*This erases the underscore-prefixed file from the input folder and must stay after the save above
*NOTE(review): presumably the file loaded at the top of this script (the two names differ only in letter case) - confirm on case-sensitive file systems
erase "$input\_dkdb1999_2003.dta"

